# Match Norwegian SIC industry descriptions to O*NET occupation titles
# using sentence-transformer embeddings (cosine similarity).
#
# This is step 1 of the industry-to-O*NET crosswalk. Step 2 is
# occ_crosswalk.R, which joins the matches with O*NET work context data.
#
# Uses:
#   data/admin/external/sic_norway_old.csv     SIC codes/descriptions (pre-2008)
#   data/admin/external/sic_norway_new.csv     SIC codes/descriptions (2008+)
#   data/onet/Occupation Data.txt         O*NET occupation titles/descriptions
#
# Creates:
#   data/admin/external/industry_with_matches.csv  Each SIC code matched to best O*NET occupation

from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import os

# --- paths ---
# project root is three directories above the folder containing this script
project_root = os.path.join(os.path.dirname(__file__), "..", "..", "..")
onet_path = os.path.join(project_root, "data", "onet")
external_path = os.path.join(project_root, "data", "admin", "external")

# O*NET occupation table (tab-separated)
onet_file = os.path.join(onet_path, "Occupation Data.txt")
onet_df = pd.read_csv(onet_file, sep="\t")

# Norwegian SIC industry descriptions, one semicolon-separated file per
# classification; only the old file is read with an explicit latin1 encoding
industry_old = pd.read_csv(
    os.path.join(external_path, "sic_norway_old.csv"),
    sep=";",
    encoding="latin1",
)
industry_new = pd.read_csv(
    os.path.join(external_path, "sic_norway_new.csv"),
    sep=";",
)

# tag every row with the classification it came from, then stack old above new
for label, frame in (("old", industry_old), ("new", industry_new)):
    frame["source"] = label
industry_df = pd.concat((industry_old, industry_new), ignore_index=True)

# embed industry names and occupation titles+descriptions
#
# NOTE(review): all-MiniLM-L6-v2 is not one of the multilingual
# sentence-transformers checkpoints -- confirm that the SIC `name` column
# holds English text.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Fill missing values per column *before* concatenating: NaN propagates
# through string "+", so filling afterwards would blank out the whole text
# (title included) whenever only the description is missing.
occupation_titles = onet_df["Title"].fillna("")
occupation_descriptions = onet_df["Description"].fillna("")
occupation_texts = (occupation_titles + ". " + occupation_descriptions).tolist()

# industries are embedded from their name only; missing names become ""
industry_texts = industry_df["name"].fillna("").tolist()

# one embedding row per text, returned as tensors
occupation_embeddings = model.encode(occupation_texts, convert_to_tensor=True)
industry_embeddings = model.encode(industry_texts, convert_to_tensor=True)

# for each industry, find the most similar occupation
#
# A single batched cos_sim call yields the whole (industry x occupation)
# similarity matrix, so the occupation embeddings are processed once rather
# than once per industry row.
if len(industry_df) > 0:
    similarity_matrix = (
        util.cos_sim(industry_embeddings, occupation_embeddings).cpu().numpy()
    )
    # np.argmax returns the first maximum, so ties resolve to the occupation
    # that appears earliest in onet_df
    best_idx = similarity_matrix.argmax(axis=1)
    best_matches = onet_df["Title"].iloc[best_idx].tolist()
    # keep the scores as a list of NumPy scalars so pandas infers the column
    # dtype the same way it does for a list built by appending
    scores = list(similarity_matrix[np.arange(len(best_idx)), best_idx])
else:
    # nothing to match; still write the (empty) output with both columns
    best_matches, scores = [], []

industry_df["best_match"] = best_matches
industry_df["similarity_score"] = scores

# write every SIC row together with its best O*NET title and similarity score
industry_df.to_csv(
    os.path.join(project_root, "data", "admin", "external", "industry_with_matches.csv"),
    index=False,
)
